A nationwide survey of hospital costs conducted by the US Agency for Healthcare consists of hospital records of inpatient samples. The given data is restricted to the city of Wisconsin and relates to patients in the age group 0-17 years. The agency wants to analyze the data to research on healthcare costs and their utilization.
Healthcare
AGE = Age of the patient discharged
FEMALE = A binary variable that indicates if the patient is female
LOS = Length of stay in days
RACE = Race of the patient (specified numerically)
TOTCHG = Hospital discharge costs
APRDRG = All Patient Refined Diagnosis Related Groups
setwd("C:/Users/lorad/Documents/Projects/Personal_R")
library(readxl)
hospitalcosts <- read_excel("1555054100_hospitalcosts.xlsx",
col_types = c("numeric", "numeric", "numeric",
"numeric", "numeric", "numeric"))
head(hospitalcosts)
## # A tibble: 6 x 6
## AGE FEMALE LOS RACE TOTCHG APRDRG
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 17 1 2 1 2660 560
## 2 17 0 2 1 1689 753
## 3 17 1 7 1 20060 930
## 4 17 1 1 1 736 758
## 5 17 1 1 1 1194 754
## 6 17 0 0 1 3305 347
summary(hospitalcosts)
## AGE FEMALE LOS RACE
## Min. : 0.000 Min. :0.000 Min. : 0.000 Min. :1.000
## 1st Qu.: 0.000 1st Qu.:0.000 1st Qu.: 2.000 1st Qu.:1.000
## Median : 0.000 Median :1.000 Median : 2.000 Median :1.000
## Mean : 5.086 Mean :0.512 Mean : 2.828 Mean :1.078
## 3rd Qu.:13.000 3rd Qu.:1.000 3rd Qu.: 3.000 3rd Qu.:1.000
## Max. :17.000 Max. :1.000 Max. :41.000 Max. :6.000
## NA's :1
## TOTCHG APRDRG
## Min. : 532 Min. : 21.0
## 1st Qu.: 1216 1st Qu.:640.0
## Median : 1536 Median :640.0
## Mean : 2774 Mean :616.4
## 3rd Qu.: 2530 3rd Qu.:751.0
## Max. :48388 Max. :952.0
##
#checkpoint1 -> finding NA values
which(is.na(hospitalcosts), arr.ind=TRUE)
## row col
## [1,] 277 4
library(dplyr)
library(tidyr)
hospitalcosts_NoNA <- hospitalcosts %>% mutate(across(`RACE`, ~replace_na(., round(median(., na.rm=TRUE),2))))
# unique(hospitalcosts_NoNA$RACE)
# which(is.na(hospitalcosts_NoNA), arr.ind=TRUE)
#creating age group
hospitalcosts_NoNA_AgeGroup <- hospitalcosts_NoNA %>% mutate(age_group = case_when(AGE < 5~"<5",
AGE >= 5 & AGE < 10 ~"5-9",
AGE >= 10 & AGE < 15 ~"10-14",
AGE >= 15 ~ ">=15"))
# unique(hospitalcosts_NoNA_AgeGroup$age_group)
hist(hospitalcosts_NoNA$AGE, main="Histogram of Age Group and their hospital visits",
xlab="Age group", border="black", col=c("light blue", "dark blue"), xlim=c(0,20), ylim=c(0,350))
summary(as.factor(hospitalcosts_NoNA_AgeGroup$age_group))
## <5 >=15 10-14 5-9
## 323 96 70 11
library(ggplot2)
library(plotly)
p1<-ggplot(hospitalcosts_NoNA_AgeGroup, aes(x=age_group, y=TOTCHG,color= age_group)) +
geom_boxplot()+
labs(title="Plot of Hospital discharge costs per Age Group",x="Age Group", y = "Hospital Discharge Costs")
ggplotly(p1)
#Summary of Hospital discharge per Age Group
tapply(hospitalcosts_NoNA_AgeGroup$TOTCHG, hospitalcosts_NoNA_AgeGroup$age_group, summary)
## $`<5`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 550 1266 1517 2383 2081 29188
##
## $`>=15`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 532 1153 1854 3705 3148 48388
##
## $`10-14`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 548.0 924.8 1334.5 2705.8 2908.8 17524.0
##
## $`5-9`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1132 3059 7923 6583 10046 10585
hospitalcosts_AgeGenderFactor<-hospitalcosts_NoNA_AgeGroup %>%
mutate(GENDER=recode(FEMALE, '1'='FEMALE', '0'='MALE')) %>%
mutate_at(vars("age_group","GENDER"), as.factor)
meta<-hospitalcosts_AgeGenderFactor %>%
group_by(age_group,GENDER) %>%
summarise(TOTCHG = sum(TOTCHG), LOS = sum(LOS))
print(meta)
## # A tibble: 8 x 4
## # Groups: age_group [4]
## age_group GENDER TOTCHG LOS
## <fct> <fct> <dbl> <dbl>
## 1 <5 FEMALE 322491 458
## 2 <5 MALE 447211 517
## 3 >=15 FEMALE 204169 159
## 4 >=15 MALE 151504 86
## 5 10-14 FEMALE 114559 135
## 6 10-14 MALE 74850 41
## 7 5-9 FEMALE 10584 2
## 8 5-9 MALE 61826 16
AgeGenderInfluence=lm(TOTCHG~ AGE + GENDER, data=hospitalcosts_AgeGenderFactor)
summary(AgeGenderInfluence)
##
## Call:
## lm(formula = TOTCHG ~ AGE + GENDER, data = hospitalcosts_AgeGenderFactor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3406 -1443 -869 -152 44951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1970.44 294.35 6.694 5.87e-11 ***
## AGE 86.28 25.48 3.387 0.000763 ***
## GENDERMALE 748.19 353.83 2.115 0.034967 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3845 on 497 degrees of freedom
## Multiple R-squared: 0.0261, Adjusted R-squared: 0.02218
## F-statistic: 6.66 on 2 and 497 DF, p-value: 0.001399
hospitalcosts_AgeGenderFactor<-hospitalcosts_AgeGenderFactor %>%
mutate_at(vars("RACE"), as.factor)
AgeGenderRaceInfluence=lm(LOS~ AGE + GENDER+ RACE, data=hospitalcosts_AgeGenderFactor)
summary(AgeGenderRaceInfluence)
##
## Call:
## lm(formula = LOS ~ AGE + GENDER + RACE, data = hospitalcosts_AgeGenderFactor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.204 -1.204 -0.856 0.144 37.796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.20362 0.25930 12.355 <2e-16 ***
## AGE -0.03902 0.02254 -1.731 0.084 .
## GENDERMALE -0.34799 0.31221 -1.115 0.266
## RACE2 -0.37573 1.39444 -0.269 0.788
## RACE3 0.79638 3.38275 0.235 0.814
## RACE4 0.59690 1.95542 0.305 0.760
## RACE5 -0.85563 1.96098 -0.436 0.663
## RACE6 -0.71745 2.39082 -0.300 0.764
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.373 on 492 degrees of freedom
## Multiple R-squared: 0.008562, Adjusted R-squared: -0.005544
## F-statistic: 0.607 on 7 and 492 DF, p-value: 0.7503
hospitalcosts_AgeGenderFactor<-hospitalcosts_AgeGenderFactor %>%
mutate_at(vars("RACE"), as.factor)
allInfluence=lm(TOTCHG~ ., data=hospitalcosts_AgeGenderFactor)
summary(allInfluence)
##
## Call:
## lm(formula = TOTCHG ~ ., data = hospitalcosts_AgeGenderFactor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6484 -632 -135 142 42904
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4735.6923 472.7590 10.017 < 2e-16 ***
## AGE 448.3289 162.7154 2.755 0.00608 **
## FEMALE -392.0562 248.3863 -1.578 0.11512
## LOS 744.6871 34.9032 21.336 < 2e-16 ***
## RACE2 431.9519 1086.2886 0.398 0.69107
## RACE3 334.7421 2616.0844 0.128 0.89824
## RACE4 -447.1070 1524.2238 -0.293 0.76939
## RACE5 -1651.9104 1526.6784 -1.082 0.27978
## RACE6 -649.7641 1851.5338 -0.351 0.72579
## APRDRG -7.3740 0.7306 -10.093 < 2e-16 ***
## age_group>=15 -4976.5708 2622.0613 -1.898 0.05829 .
## age_group10-14 -4383.7035 2085.4692 -2.102 0.03606 *
## age_group5-9 -1052.0793 1412.0320 -0.745 0.45658
## GENDERMALE NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2608 on 487 degrees of freedom
## Multiple R-squared: 0.5609, Adjusted R-squared: 0.5501
## F-statistic: 51.84 on 12 and 487 DF, p-value: < 2.2e-16
# removing the variable RACE and GENDER in the model
LOSageAPRDRGInfluence=lm(TOTCHG~ LOS+AGE+APRDRG, data=hospitalcosts_AgeGenderFactor)
summary(LOSageAPRDRGInfluence)
##
## Call:
## lm(formula = TOTCHG ~ LOS + AGE + APRDRG, data = hospitalcosts_AgeGenderFactor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6603 -718 -169 123 43350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4959.8572 433.1927 11.450 < 2e-16 ***
## LOS 740.8349 34.8778 21.241 < 2e-16 ***
## AGE 128.5889 17.0670 7.534 2.34e-13 ***
## APRDRG -8.0060 0.6636 -12.065 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2614 on 496 degrees of freedom
## Multiple R-squared: 0.5508, Adjusted R-squared: 0.5481
## F-statistic: 202.7 on 3 and 496 DF, p-value: < 2.2e-16